library(ggplot2)
library(scales)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(mgcv)
## Loading required package: nlme
##
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
##
## collapse
## This is mgcv 1.9-0. For overview type 'help("mgcv-package")'.
library(maps)
library(mapdata)
transfer_data <- read.csv("merged_cleaned_data.csv")
head(transfer_data)
## club name age nationality position
## 1 Arsenal FC Alex Oxlade-Chamberlain 17 England Central Midfield
## 2 Arsenal FC Gervinho 24 Cote d'Ivoire Left Winger
## 3 Arsenal FC Mikel Arteta 29 Spain Central Midfield
## 4 Arsenal FC Per Mertesacker 26 Germany Centre-Back
## 5 Arsenal FC André Santos 28 Brazil Attacking Midfield
## 6 Arsenal FC Chu-young Park 26 Korea, South Centre-Forward
## short_pos market_value dealing_club dealing_country fee movement window
## 1 CM 2500000 Southampton England 13800000 in summer
## 2 LW 15000000 LOSC Lille France 12000000 in summer
## 3 CM 15000000 Everton England 12000000 in summer
## 4 CB 12500000 Werder Bremen Germany 11300000 in summer
## 5 AM 8500000 Fenerbahce Turkey 7000000 in summer
## 6 CF 7000000 Monaco France 6500000 in summer
## league season is_loan loan_status
## 1 Premier League 2011 False Not Applicable
## 2 Premier League 2011 False Not Applicable
## 3 Premier League 2011 False Not Applicable
## 4 Premier League 2011 False Not Applicable
## 5 Premier League 2011 False Not Applicable
## 6 Premier League 2011 False Not Applicable
str(transfer_data)
## 'data.frame': 17880 obs. of 16 variables:
## $ club : chr "Arsenal FC" "Arsenal FC" "Arsenal FC" "Arsenal FC" ...
## $ name : chr "Alex Oxlade-Chamberlain" "Gervinho" "Mikel Arteta" "Per Mertesacker" ...
## $ age : num 17 24 29 26 28 26 19 19 24 24 ...
## $ nationality : chr "England" "Cote d'Ivoire" "Spain" "Germany" ...
## $ position : chr "Central Midfield" "Left Winger" "Central Midfield" "Centre-Back" ...
## $ short_pos : chr "CM" "LW" "CM" "CB" ...
## $ market_value : num 2500000 15000000 15000000 12500000 8500000 ...
## $ dealing_club : chr "Southampton" "LOSC Lille" "Everton" "Werder Bremen" ...
## $ dealing_country: chr "England" "France" "England" "Germany" ...
## $ fee : int 13800000 12000000 12000000 11300000 7000000 6500000 1130000 1000000 34000000 27500000 ...
## $ movement : chr "in" "in" "in" "in" ...
## $ window : chr "summer" "summer" "summer" "summer" ...
## $ league : chr "Premier League" "Premier League" "Premier League" "Premier League" ...
## $ season : int 2011 2011 2011 2011 2011 2011 2011 2011 2011 2011 ...
## $ is_loan : chr "False" "False" "False" "False" ...
## $ loan_status : chr "Not Applicable" "Not Applicable" "Not Applicable" "Not Applicable" ...
unique_positions <- unique(transfer_data$short_pos)
print(unique_positions)
## [1] "CM" "LW" "CB" "AM" "CF" "RB"
## [7] "LB" "RW" "GK" "LM" "DM" "RM"
## [13] "SS" "defence" "midfield" "attack"
transfer_data <- transfer_data %>%
mutate(position_category = case_when(
short_pos %in% c("CM", "LM", "AM", "DM", "RM") ~ "midfield",
short_pos %in% c("RB", "LB", "CB") ~ "defence",
short_pos %in% c("SS", "LW", "RW", "CF") ~ "attack",
TRUE ~ short_pos
))
head(transfer_data)
## club name age nationality position
## 1 Arsenal FC Alex Oxlade-Chamberlain 17 England Central Midfield
## 2 Arsenal FC Gervinho 24 Cote d'Ivoire Left Winger
## 3 Arsenal FC Mikel Arteta 29 Spain Central Midfield
## 4 Arsenal FC Per Mertesacker 26 Germany Centre-Back
## 5 Arsenal FC André Santos 28 Brazil Attacking Midfield
## 6 Arsenal FC Chu-young Park 26 Korea, South Centre-Forward
## short_pos market_value dealing_club dealing_country fee movement window
## 1 CM 2500000 Southampton England 13800000 in summer
## 2 LW 15000000 LOSC Lille France 12000000 in summer
## 3 CM 15000000 Everton England 12000000 in summer
## 4 CB 12500000 Werder Bremen Germany 11300000 in summer
## 5 AM 8500000 Fenerbahce Turkey 7000000 in summer
## 6 CF 7000000 Monaco France 6500000 in summer
## league season is_loan loan_status position_category
## 1 Premier League 2011 False Not Applicable midfield
## 2 Premier League 2011 False Not Applicable attack
## 3 Premier League 2011 False Not Applicable midfield
## 4 Premier League 2011 False Not Applicable defence
## 5 Premier League 2011 False Not Applicable midfield
## 6 Premier League 2011 False Not Applicable attack
unique_positions <- unique(transfer_data$position_category)
print(unique_positions)
## [1] "midfield" "attack" "defence" "GK"
filtered_data <- transfer_data[transfer_data$fee > 0, ]
transfer_data_2021 <- transfer_data[transfer_data$season == "2021" & transfer_data$fee > 0, ]
ggplot(transfer_data_2021, aes(x = market_value, y = fee, color = league)) +
geom_point() +
labs(title = "Relationship between Market Value and Transfer Fee (All Leagues - 2021)",
x = "Market Value",
y = "Transfer Fee",
color = "League")
ggplot(transfer_data, aes(x = age, fill = ..count..)) +
geom_histogram(binwidth = 1, color = "black", alpha = 0.7) +
scale_fill_gradient("Count", low = "yellow", high = "purple") +
labs(title = "Transfer Distribution by Age from 2011 to 2020",
x = "Age",
y = "Count") +
theme_minimal()
## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
ggplot(transfer_data, aes(x = age, y = fee, color = league)) +
geom_point(alpha = 0.7) +
#scale_color_manual(values = league_colors, name = "League") +
labs(title = "Transfer Fees versus Age",
x = "Age",
y = "Transfer Fee",
color = "League") +
theme_minimal()
# x_limits =c(0,12500000)
# y_limits =c(0,15000000)
## faceted by league
ggplot(filtered_data, aes(x = market_value, y = fee)) +
geom_point() +
labs(title = "Relationship between Market Value and Transfer Fee (All Leagues)",
x = "Market Value",
y = "Transfer Fee") + facet_wrap(~ league)
ggplot(filtered_data, aes(x = log(market_value), y = log(fee))) +
geom_point() +
labs(title = "Relationship between log(Market Value) and log(Transfer Fee) (All Leagues)",
x = "Market Value",
y = "Transfer Fee") +
facet_wrap(~ league, scales = "free")
ggplot(filtered_data, aes(x = market_value, y = fee, color = position_category)) +
geom_point(aes(group = position_category), size = 1) +
#geom_point(size = 1) +
labs(
title = "Scatter Plot - Market Value and Position on Transfer Fees",
x = "Market Value",
y = "Transfer Fee",
color = "Position"
) +
facet_wrap(~position_category)
##Log Scale
ggplot(filtered_data, aes(x = log(market_value), y = log(fee), color = position_category)) +
geom_point(aes(group = position_category), size = 1) +
#geom_point(size = 1) +
labs(
title = "Scatter Plot - log(Market Value) and Position on log(Transfer Fees)",
x = "Market Value",
y = "Transfer Fee",
color = "Position"
) +
facet_wrap(~position_category)
# Filter the data for the year 2021 and exclude players with a fee of 0
transfer_data_2021 <- transfer_data[transfer_data$season == "2021" & transfer_data$fee > 0, ]
# Scatter plot with color-coded points for each league in the year 2021
ggplot(transfer_data, aes(x = market_value, y = fee, color = league)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, color = "blue") + # Add a linear regression line
labs(title = "Relationship between Market Value and Transfer Fee (All Leagues)",
x = "Market Value",
y = "Transfer Fee",
color = "League") +
facet_grid(league ~ .)
## `geom_smooth()` using formula = 'y ~ x'
# Create a new variable for year categories
filtered_data$year_category <- cut(filtered_data$season, breaks = c(2011, 2014, 2018, 2021), labels = c("2011-2014", "2015-2018", "2019-2021"))
# Filter out rows with NA in year_category
filtered_data <- filtered_data[!is.na(filtered_data$year_category), ]
# Visualize the scatter plot with categorized years
ggplot(filtered_data, aes(x = market_value, y = fee, color = league)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, color = "blue", formula = y ~ x) +
labs(title = "Market Value vs Transfer Fees - Categorized Years",
x = "Market Value",
y = "Transfer Fee",
color = "League") +
facet_grid(league ~ year_category)
# Filter the data for the year 2021 and exclude players with a fee of 0
transfer_data_2021 <- transfer_data[transfer_data$season == "2021" & transfer_data$fee > 0, ]
# Scatter plot with color-coded points for each league in the year 2021
ggplot(transfer_data_2021, aes(x = market_value, y = fee, color = league)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, color = "blue") + # Add a linear regression line
labs(title = "Relationship between Market Value and Transfer Fee (All League) 2021",
x = "Market Value",
y = "Transfer Fee",
color = "League") +
facet_grid(league ~ .)
## `geom_smooth()` using formula = 'y ~ x'
transfer_data_2021 <- transfer_data[transfer_data$season == "2021" & transfer_data$fee > 0, ]
# Define unique colors for each league
league_colors <- c("Premier League" = "red", "Laliga" = "blue", "1 Bundesliga" = "green", "Serie A" = "purple", "Ligue 1" = "orange")
# Scatter plot with color-coded points and separate trend lines for each league
ggplot(transfer_data_2021, aes(x = market_value, y = fee, color = league)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, aes(group = league, color = league)) + # Separate trend lines for each league with different colors
scale_color_manual(values = league_colors, name = "League") + # Set colors according to league
labs(title = "Relationship between Market Value and Transfer Fee (All Leagues - 2021)",
x = "Market Value",
y = "Transfer Fee",
color = "League")
## `geom_smooth()` using formula = 'y ~ x'
create_scatter_plot <- function(data, title) {
ggplot(data, aes(x = market_value, y = fee, color = league)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, aes(group = league, color = league)) +
scale_color_manual(values = league_colors, name = "League") +
labs(title = title,
x = "Market Value",
y = "Transfer Fee",
color = "League")
}
transfer_data_2021 <- transfer_data[transfer_data$season == "2021" & transfer_data$fee > 0, ]
plot_2021 <- create_scatter_plot(transfer_data_2021, "Relationship between Market Value and Transfer Fee (All Leagues - 2021)")
transfer_data_2016 <- transfer_data[transfer_data$season == "2016" & transfer_data$fee > 0, ]
plot_2016 <- create_scatter_plot(transfer_data_2016, "Relationship between Market Value and Transfer Fee (All Leagues - 2016)")
transfer_data_2011 <- transfer_data[transfer_data$season == "2011" & transfer_data$fee > 0, ]
plot_2011 <- create_scatter_plot(transfer_data_2011, "Relationship between Market Value and Transfer Fee (All Leagues - 2011)")
par(mfrow = c(1, 3))
plot_2021
## `geom_smooth()` using formula = 'y ~ x'
plot_2016
## `geom_smooth()` using formula = 'y ~ x'
plot_2011
## `geom_smooth()` using formula = 'y ~ x'
# Calculate the correlation coefficient
correlation_coefficient <- cor(filtered_data$market_value, filtered_data$fee)
# Print the correlation coefficient
cat("Correlation Coefficient:", correlation_coefficient, "\n")
## Correlation Coefficient: 0.8706092
Using filtered_data:
model <- lm(fee ~ market_value + league, data = filtered_data)
# Plot
ggplot(filtered_data, aes(x = market_value, y = fee, color = league)) +
geom_point() +
geom_abline(intercept = coef(model)[1], slope = coef(model)[2], color = "blue") +
labs(title = "Relationship between Market Value and Transfer Fee (All League)",
x = "Market Value",
y = "Transfer Fee",
color = "League") +
facet_grid(league ~ .)
model2 <- lm(fee ~ market_value + short_pos, data = filtered_data)
# Plot
ggplot(filtered_data, aes(x = market_value, y = fee, color = position_category)) +
geom_point() +
geom_abline(intercept = coef(model)[1], slope = coef(model)[2], color = "blue") +
labs(title = "Relationship between Market Value and Transfer Fee (All League)",
x = "Market Value",
y = "Transfer Fee",
color = "League") +
facet_grid(league ~ .)
model1 <- lm(fee ~ market_value + short_pos + league + season, data = filtered_data)
ggplot(filtered_data, aes(x = market_value, y = fee, color = league)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, color = "blue", formula = y ~ x) + # Add a linear regression line
labs(title = "Relationship between Market Value and Transfer Fee (Model)",
x = "Market Value",
y = "Transfer Fee",
color = "League") +
facet_grid(league ~ .)
MODEL WE CHOSE
Linear Regression with Age Categories
# Create age categories
filtered_data$age_category <- cut(filtered_data$age, breaks = c(18, 21, 25, 30, 35, 40, 45, 50, 100),
labels = c("18-21", "22-25", "26-30", "31-35", "36-40", "41-45", "46-50", "50+"))
# Fit a linear regression model with age categories
model_age_cat <- lm(fee ~ market_value + short_pos + league + season + as.factor(age_category), data = filtered_data)
# Visualize the relationship
ggplot(filtered_data, aes(x = market_value, y = fee, color = league)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, color = "blue", formula = y ~ x) +
labs(title = "Relationship between Market Value and Transfer Fee (Model with Age Categories)",
x = "Market Value",
y = "Transfer Fee",
color = "League") +
facet_grid(league ~ .)
ANALYSIS OF CHOSEN MODEL
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
vif(model_age_cat)
## GVIF Df GVIF^(1/(2*Df))
## market_value 1.111339 1 1.054201
## short_pos 1.106947 15 1.003393
## league 1.115621 4 1.013770
## season 1.061856 1 1.030464
## as.factor(age_category) 1.092133 4 1.011077
# Add predictions and actual values to the dataset
filtered_data$predictions_age_cat <- predict(model_age_cat, newdata = filtered_data)
filtered_data$actual_values <- filtered_data$fee
# Visualize predictions vs. actual values
ggplot(filtered_data, aes(x = actual_values, y = predictions_age_cat, color = league)) +
geom_point() +
geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "red") +
labs(title = "Predictions vs. Actual Transfer Fees (Model with Age Categories)",
x = "Actual Transfer Fee",
y = "Predicted Transfer Fee",
color = "League")
## Warning: Removed 326 rows containing missing values (`geom_point()`).
residuals_age_cat <- residuals(model_age_cat)
# Assuming log transformation is needed for market_value
filtered_data$log_market_value <- log(filtered_data$market_value)
# Ensure both data frames have the same number of rows
min_rows <- min(nrow(filtered_data), length(residuals_age_cat))
filtered_data <- head(filtered_data, min_rows)
residuals_data <- data.frame(
log_market_value = filtered_data$log_market_value,
residuals = head(residuals_age_cat, min_rows),
league = filtered_data$league
)
# Visualize residuals
ggplot(residuals_data, aes(x = log_market_value, y = residuals, color = league)) +
geom_point() +
geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
labs(title = "Residuals vs. Log-transformed Market Value (Model with Age Categories)",
x = "Log-transformed Market Value",
y = "Residuals",
color = "League") +
facet_grid(league ~ .)
# Predict transfer fees
predictions_age_cat <- predict(model_age_cat, newdata = filtered_data)
# Evaluate accuracy
accuracy <- sqrt(mean((filtered_data$fee - predictions_age_cat)^2))
print(paste("Root Mean Squared Error (RMSE):", round(accuracy, 2)))
## [1] "Root Mean Squared Error (RMSE): NA"
# Check for missing values
any(is.na(filtered_data$fee))
## [1] FALSE
any(is.na(predictions_age_cat))
## [1] TRUE
# Check unique values and missing values in age_category
table(filtered_data$age_category, useNA = "always")
##
## 18-21 22-25 26-30 31-35 36-40 41-45 46-50 50+ <NA>
## 2156 3747 2765 612 22 0 0 0 320
# Remove rows with missing age_category
filtered_data <- na.omit(filtered_data, cols = "age_category")
# Predict transfer fees
filtered_data$predictions_age_cat <- predict(model_age_cat, newdata = filtered_data)
# Evaluate accuracy
accuracy <- sqrt(mean((filtered_data$fee - filtered_data$predictions_age_cat)^2))
print(paste("Root Mean Squared Error (RMSE):", round(accuracy, 2)))
## [1] "Root Mean Squared Error (RMSE): 5715280.17"
ANALYSIS USING PREDICTED VALUES
Positional Analysis using position_category:
# Create a summary table for average predicted transfer fees by position category
position_category_summary <- aggregate(predictions_age_cat ~ position_category, data = filtered_data, FUN = mean)
Comparative Analysis for Position Categories:
library(ggplot2)
ggplot(filtered_data, aes(x = position_category, y = predictions_age_cat, fill = position_category)) +
geom_boxplot() +
labs(title = "Comparison of Predicted Transfer Fees by Position Category",
x = "Position Category",
y = "Predicted Transfer Fee",
fill = "Position Category")
Positional Trends Over Time by Category:
ggplot(filtered_data, aes(x = season, y = predictions_age_cat, color = position_category)) +
geom_line() +
labs(title = "Position Category Trends in Predicted Transfer Fees Over Time",
x = "Season",
y = "Predicted Transfer Fee",
color = "Position Category")
Positional Influence in Different Leagues by Category:
ggplot(filtered_data, aes(x = position_category, y = predictions_age_cat, color = league)) +
geom_point() +
facet_wrap(~league) +
labs(title = "Impact of Position Category on Predicted Transfer Fees by League",
x = "Position Category",
y = "Predicted Transfer Fee",
color = "League")
Top Value Position Categories:
# Identify top position categories by average predicted transfer fees
top_position_categories <- position_category_summary[order(position_category_summary$predictions, decreasing = TRUE), ]
top_position_categories
## position_category predictions_age_cat
## 1 attack 8602416
## 4 midfield 7051186
## 2 defence 5901163
## 3 GK 4554635
Club Analysis using Club:
# Create a summary table for average predicted transfer fees by club
club_summary <- aggregate(predictions_age_cat ~ club, data = filtered_data, FUN = mean)
Top Value Clubs: which football clubs, on average, tend to have higher predicted transfer fees:
# Identify top clubs by average predicted transfer fees
top_clubs <- club_summary[order(club_summary$predictions, decreasing = TRUE), ]
top_clubs
## club predictions_age_cat
## 28 Brentford FC 35321225.9
## 92 Manchester United 23203024.6
## 57 FC Barcelona 23135562.2
## 40 Chelsea FC 20839961.4
## 113 Real Madrid 20783452.9
## 101 Paris Saint-Germain 19296946.0
## 85 Leeds United 18922673.6
## 23 Bayern Munich 18232087.0
## 21 Atlético Madrid 17758314.7
## 91 Manchester City 16944361.5
## 12 Arsenal FC 16609382.5
## 88 Liverpool FC 16188908.0
## 144 Tottenham Hotspur 14796638.7
## 26 Borussia Dortmund 14701303.8
## 55 Everton FC 14365181.5
## 86 Leicester City 12751318.9
## 95 Newcastle United 12438494.6
## 14 AS Monaco 12437859.7
## 163 Wolverhampton Wanderers 12072863.8
## 20 Athletic Bilbao 11504204.2
## 106 RB Leipzig 11350145.0
## 127 Southampton FC 10911381.5
## 153 Valencia CF 10457774.4
## 124 Sevilla FC 10272890.9
## 44 Crystal Palace 10151282.3
## 84 Juventus FC 9911073.0
## 114 Real Sociedad 9891309.7
## 161 West Ham United 9866030.7
## 7 AC Milan 9603055.0
## 99 Olympique Lyon 9400545.9
## 18 Aston Villa 9043756.0
## 22 Bayer 04 Leverkusen 8382062.6
## 133 SSC Napoli 8351613.5
## 83 Inter Milan 8302543.1
## 158 Villarreal CF 8295922.4
## 100 Olympique Marseille 8290986.6
## 72 Fulham FC 8153811.3
## 16 AS Roma 8126518.4
## 104 Queens Park Rangers 8094089.2
## 159 Watford FC 7997975.1
## 125 Sheffield United 7682641.8
## 112 Real Betis Balompié 7628297.0
## 142 Swansea City 7557900.4
## 107 RC Lens 7396897.1
## 31 Burnley FC 7384386.0
## 30 Brighton & Hove Albion 7215347.8
## 162 Wigan Athletic 7167283.6
## 89 LOSC Lille 7038928.0
## 11 Arminia Bielefeld 7022720.0
## 138 Stoke City 6844941.8
## 157 VfL Wolfsburg 6774211.4
## 27 Borussia Mönchengladbach 6632234.9
## 139 Sunderland AFC 6622662.9
## 160 West Bromwich Albion 6397311.3
## 135 Stade Brestois 29 6388202.4
## 9 AFC Bournemouth 6299866.7
## 82 Hull City 6241744.7
## 36 Cardiff City 6206134.1
## 63 FC Internazionale 6034000.6
## 67 FC Schalke 04 5981894.6
## 109 RCD Espanyol Barcelona 5630434.0
## 51 Eintracht Frankfurt 5609588.4
## 80 Hertha BSC 5601014.8
## 3 1.FC Union Berlin 5564262.9
## 8 ACF Fiorentina 5563297.6
## 81 Huddersfield Town 5561150.7
## 97 Norwich City 5510506.1
## 93 Middlesbrough FC 5462336.4
## 54 ESTAC Troyes 5433074.8
## 61 FC Girondins Bordeaux 5292292.6
## 148 UD Las Palmas 5138195.3
## 137 Stade Rennais FC 5100938.7
## 98 OGC Nice 4997094.3
## 94 Montpellier HSC 4935582.3
## 90 Málaga CF 4785305.9
## 39 Celta de Vigo 4691938.1
## 132 SS Lazio 4670241.4
## 111 Reading FC 4612114.7
## 145 TSG 1899 Hoffenheim 4585713.4
## 146 UC Sampdoria 4488508.8
## 69 FC Toulouse 4462522.2
## 19 Atalanta BC 4291105.9
## 17 AS Saint-Étienne 4244453.3
## 156 VfL Bochum 4232146.2
## 77 Hamburger SV 3936806.8
## 38 CD Leganés 3903586.7
## 152 US Sassuolo 3890651.9
## 136 Stade Reims 3824004.1
## 108 RC Strasbourg Alsace 3816513.4
## 46 Deportivo Alavés 3785342.9
## 128 SPAL 3753044.3
## 10 Amiens SC 3720518.3
## 105 Rayo Vallecano 3589815.2
## 15 AS Nancy-Lorraine 3551603.5
## 68 FC Sochaux-Montbéliard 3471259.9
## 119 SC Freiburg 3458993.9
## 110 RCD Mallorca 3446520.2
## 66 FC Nantes 3441685.8
## 65 FC Metz 3418152.0
## 87 Levante UD 3416781.4
## 96 Nîmes Olympique 3379720.4
## 143 Torino FC 3376155.5
## 47 Deportivo de La Coruña 3370731.6
## 149 Udinese Calcio 3348998.2
## 115 Real Valladolid CF 3329160.5
## 134 Stade Brest 29 3323282.5
## 64 FC Lorient 3279134.4
## 73 Genoa CFC 3268296.1
## 76 Granada CF 3254832.1
## 122 SD Eibar 3254778.8
## 121 SCO Angers 3213089.5
## 141 SV Werder Bremen 3191723.7
## 74 Getafe CF 3175509.9
## 34 Cagliari Calcio 3164896.4
## 4 1.FSV Mainz 05 3138964.2
## 1 1. FC Köln 3119197.5
## 154 Valenciennes FC 3112111.0
## 25 Bologna FC 1909 3046718.0
## 48 Dijon FCO 2863311.6
## 42 Clermont Foot 63 2730241.6
## 78 Hannover 96 2712035.5
## 126 SM Caen 2668148.3
## 151 US Palermo 2604267.2
## 2 1.FC Nuremberg 2554642.6
## 155 VfB Stuttgart 2533119.3
## 35 Calcio Catania 2485265.9
## 60 FC Évian Thonon Gaillard 2431861.3
## 118 SC Bastia 2352141.2
## 75 Girona FC 2342926.5
## 59 FC Empoli 2319020.6
## 56 FC Augsburg 2259524.0
## 49 EA Guingamp 2138078.3
## 52 Elche CF 2092230.1
## 53 ES Troyes AC 2039228.4
## 70 Fortuna Düsseldorf 2011225.2
## 32 CA Osasuna 1924490.5
## 79 Hellas Verona 1904339.4
## 45 Delfino Pescara 1936 1772364.9
## 33 Cádiz CF 1747762.6
## 123 SD Huesca 1739582.1
## 147 UD AlmerÃa 1699063.5
## 117 Robur Siena 1698947.9
## 41 Chievo Verona 1687378.8
## 103 Parma FC 1668484.7
## 116 Real Zaragoza 1556344.1
## 130 Sporting Gijón 1418699.2
## 58 FC Crotone 1383945.5
## 62 FC Ingolstadt 04 1326222.0
## 24 Benevento Calcio 1257524.7
## 29 Brescia Calcio 1142497.2
## 102 Parma Calcio 1913 1139757.7
## 120 SC Paderborn 07 1009378.1
## 6 AC Cesena 956585.8
## 50 Eintracht Braunschweig 861778.9
## 37 Carpi FC 1909 859373.7
## 71 Frosinone Calcio 741295.0
## 129 SPAL 2013 737403.8
## 13 AS Livorno 691245.1
## 140 SV Darmstadt 98 635479.7
## 131 SpVgg Greuther Fürth 634340.9
## 43 Córdoba CF 551969.2
## 5 AC Ajaccio 463712.7
## 150 US Lecce 422402.0
Nationality Analysis using Nationality:
# Create a summary table for average predicted transfer fees by nationality
nationality_summary <- aggregate(predictions_age_cat ~ nationality, data = filtered_data, FUN = mean)
library(ggplot2)
# Filter the top 10 nationalities
top_nationalities <- nationality_summary %>%
top_n(10, wt = predictions_age_cat)
# Bar chart for the top 10 nationalities
ggplot(top_nationalities, aes(x = reorder(nationality, predictions_age_cat), y = predictions_age_cat)) +
geom_bar(stat = "identity") +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = "Top 10 Nationalities - Predicted Transfer Fees",
x = "Nationality",
y = "Predicted Transfer Fee")
# World map
# Create a world map
world_map <- map_data("world")
library(maps)
ggplot(filtered_data, aes(map_id = nationality)) +
geom_map(aes(fill = predictions_age_cat), map = world_map) +
expand_limits(x = world_map$long, y = world_map$lat) +
theme_void() +
labs(title = "Predicted Transfer Fees by Nationality on World Map",
fill = "Predicted Transfer Fee")
# Identify the top nationalities by average predicted transfer fees
top_nationalities <- head(nationality_summary[order(-nationality_summary$predictions), ], 10)
# Print the top nationalities
print(top_nationalities)
## nationality predictions_age_cat
## 5 Armenia 25573507
## 19 Burundi 17992459
## 11 Belgium 15044088
## 68 Kenya 14196397
## 100 Portugal 13603025
## 40 Egypt 13354175
## 37 Dominican Republic 12847806
## 23 Central African Republic 11617273
## 47 Gabon 11512605
## 27 Colombia 10986528
Quantifying Difference Between Leagues Over Time
# Linear regression model with league, season, and league:season interaction
model3 <- lm(fee ~ market_value + short_pos + league * season, data = filtered_data)
# Visualize the league differences over time
ggplot(filtered_data, aes(x = season, y = fee, color = league)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, color = "blue", formula = y ~ x) +
labs(title = "Difference in Transfer Fees Between Leagues Over Time",
x = "Season",
y = "Transfer Fee",
color = "League") +
facet_grid(league ~ .)
Adding Interaction Terms
# Model with interaction terms
model_interaction <- lm(fee ~ market_value * short_pos * league * season * as.factor(age_category), data = filtered_data)
# Visualization
ggplot(filtered_data, aes(x = market_value, y = fee, color = league)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, color = "blue", formula = y ~ x) +
labs(title = "Model with Interaction Terms",
x = "Market Value",
y = "Transfer Fee",
color = "League") +
facet_grid(league ~ .)
Interaction Model - league:season interaction and log-transformed variables
# Log-transform fee and market value
filtered_data$log_fee <- log(filtered_data$fee)
filtered_data$log_market_value <- log(filtered_data$market_value)
# Model with league:season interaction and log-transformed variables
interaction_model <- lm(log_fee ~ log_market_value + short_pos + league * season, data = filtered_data)
# Visualization
ggplot(filtered_data, aes(x = log_market_value, y = log_fee, color = league)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, color = "blue", formula = y ~ x) +
labs(title = "Relationship between Log-transformed Market Value and Log-transformed Transfer Fee",
x = "Log-transformed Market Value",
y = "Log-transformed Transfer Fee",
color = "League") +
facet_grid(league ~ .)
# Check model summary
summary(interaction_model)
##
## Call:
## lm(formula = log_fee ~ log_market_value + short_pos + league *
## season, data = filtered_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.7435 -0.4929 0.0859 0.6103 3.9235
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.683e+02 1.927e+01 -8.734 < 2e-16 ***
## log_market_value 6.265e-01 7.378e-03 84.913 < 2e-16 ***
## short_posattack 1.002e+00 3.754e-01 2.670 0.007590 **
## short_posCB -6.446e-02 4.912e-02 -1.312 0.189443
## short_posCF 5.982e-02 4.871e-02 1.228 0.219398
## short_posCM -3.412e-02 5.124e-02 -0.666 0.505504
## short_posdefence 1.665e+00 5.705e-01 2.919 0.003525 **
## short_posDM -1.457e-01 5.513e-02 -2.643 0.008227 **
## short_posGK -4.432e-02 5.894e-02 -0.752 0.452138
## short_posLB -6.524e-02 5.817e-02 -1.122 0.262082
## short_posLM -5.942e-02 1.009e-01 -0.589 0.556064
## short_posLW -4.285e-02 5.783e-02 -0.741 0.458729
## short_posmidfield 8.839e-01 3.009e-01 2.938 0.003311 **
## short_posRB -1.277e-01 5.801e-02 -2.202 0.027713 *
## short_posRM 1.992e-01 1.183e-01 1.685 0.092109 .
## short_posRW 5.238e-02 5.822e-02 0.900 0.368253
## short_posSS 1.372e-01 8.991e-02 1.526 0.127116
## leagueLaliga 9.565e+01 2.799e+01 3.418 0.000634 ***
## leagueLigue 1 8.852e+01 2.679e+01 3.304 0.000957 ***
## leaguePremier League 9.584e+01 2.620e+01 3.658 0.000255 ***
## leagueSerie A 1.675e+02 2.419e+01 6.925 4.66e-12 ***
## season 8.615e-02 9.559e-03 9.012 < 2e-16 ***
## leagueLaliga:season -4.733e-02 1.388e-02 -3.410 0.000653 ***
## leagueLigue 1:season -4.381e-02 1.329e-02 -3.297 0.000981 ***
## leaguePremier League:season -4.725e-02 1.299e-02 -3.636 0.000278 ***
## leagueSerie A:season -8.291e-02 1.200e-02 -6.911 5.14e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9838 on 9276 degrees of freedom
## Multiple R-squared: 0.5122, Adjusted R-squared: 0.5108
## F-statistic: 389.5 on 25 and 9276 DF, p-value: < 2.2e-16
# Check variance inflation factor (VIF) for multicollinearity
library(car)
vif(interaction_model)
## there are higher-order terms (interactions) in this model
## consider setting type = 'predictor'; see ?vif
## GVIF Df GVIF^(1/(2*Df))
## log_market_value 1.217175e+00 1 1.103257
## short_pos 1.089866e+00 15 1.002873
## league 1.274360e+23 4 772.968011
## season 6.133796e+00 1 2.476650
## league:season 1.274144e+23 4 772.951660
# Add predictions and actual values to the dataset
filtered_data$predictions <- exp(predict(interaction_model))
filtered_data$actual_values <- filtered_data$fee
# Visualize predictions vs. actual values
ggplot(filtered_data, aes(x = actual_values, y = predictions, color = league)) +
geom_point() +
geom_abline(intercept = 0, slope = 1, linetype = "dashed", color = "red") +
labs(title = "Predictions vs. Actual Transfer Fees",
x = "Actual Transfer Fee",
y = "Predicted Transfer Fee",
color = "League")
# Check residuals
residuals <- residuals(interaction_model)
# Visualize residuals
ggplot(filtered_data, aes(x = log_market_value, y = residuals, color = league)) +
geom_point() +
geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
labs(title = "Residuals vs. Log-transformed Market Value",
x = "Log-transformed Market Value",
y = "Residuals",
color = "League") +
facet_grid(league ~ .)
# Evaluate model performance
library(Metrics)
rmse_value <- rmse(filtered_data$actual_values, filtered_data$predictions)
cat("Root Mean Squared Error (RMSE):", rmse_value, "\n")
## Root Mean Squared Error (RMSE): 8735957
AICs
model <- lm(fee ~ market_value + league, data = filtered_data)
model1 <- lm(fee ~ market_value + short_pos + league + season, data = filtered_data)
model2 <- lm(fee ~ market_value + short_pos, data = filtered_data)
model3 <- lm(fee ~ market_value + short_pos + league * season, data = filtered_data)
model_age_cat <- lm(fee ~ market_value + short_pos + league + season + as.factor(age_category), data = filtered_data)
model_interaction <- lm(fee ~ market_value * short_pos * league * season * as.factor(age_category), data = filtered_data)
interaction_model <- lm(log_fee ~ log_market_value + short_pos + league * season, data = filtered_data)
# AIC values
AIC(model, model1, model2, model3, model_age_cat, model_interaction, interaction_model)
## df AIC
## model 7 316192.85
## model1 23 316198.45
## model2 18 316334.64
## model3 27 316193.24
## model_age_cat 27 315901.87
## model_interaction 949 312993.23
## interaction_model 27 26121.24
correlation_coefficient <- cor(filtered_data$market_value, filtered_data$season)
# Print the correlation coefficient
cat("Correlation Coefficient:", correlation_coefficient, "\n")
## Correlation Coefficient: 0.1888159
#model 1 is the lowest
# if i were to suspect interaction it would be between position and season
model1 <- lm(fee ~ market_value + position_category + league + season, data = filtered_data)
lm_all <- lm(fee ~ market_value + position_category + league + season + age, data = filtered_data)
lm_age_val <- lm(fee ~ market_value*age + position_category + league + season, data = filtered_data)
lm_age_val2 <- lm(fee ~ market_value*age + position_category + league*season, data = filtered_data)
AIC(model1, lm_all, lm_age_val,lm_age_val2)
## df AIC
## model1 11 316183.5
## lm_all 12 315878.7
## lm_age_val 13 315631.2
## lm_age_val2 17 315623.2
ggplot(filtered_data, aes(x = market_value, y = fee, color = position_category)) +
geom_point() +
geom_abline(intercept = coef(lm_all)[1], slope = coef(lm_all)[2], color = "purple") +
labs(title = "Relationship between Market Value and Transfer Fee (All League)",
x = "Market Value",
y = "Transfer Fee",
color = "League") +
facet_grid(league ~ .)
lm_age_val <- lm(fee ~ market_value*age + position_category + league + season, data = filtered_data)
ggplot(filtered_data, aes(x = market_value, y = fee, color = position_category)) +
geom_point() +
geom_abline(intercept = coef(lm_age_val)[1], slope = coef(lm_age_val)[2], color = "purple") +
labs(title = "Relationship between Market Value and Transfer Fee (All League)",
x = "Market Value",
y = "Transfer Fee",
color = "League") +
facet_grid(league ~ .)
# Create a GAM model with a nonlinear term in age and market_value
gam_model <- gam(fee ~ s(market_value) + s(age) + position_category + league + s(season),
data = filtered_data)
# Plot the GAM model
plot(gam_model, select = 2) # Select the age smooth term for plotting
# Summary of the GAM model
summary(gam_model)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## fee ~ s(market_value) + s(age) + position_category + league +
## s(season)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6069370 161602 37.558 < 2e-16 ***
## position_categorydefence 170217 143109 1.189 0.23430
## position_categoryGK 183240 249089 0.736 0.46197
## position_categorymidfield -455786 144296 -3.159 0.00159 **
## leagueLaliga 1156746 204788 5.649 1.67e-08 ***
## leagueLigue 1 994962 196506 5.063 4.20e-07 ***
## leaguePremier League 2565210 193447 13.261 < 2e-16 ***
## leagueSerie A 747983 176323 4.242 2.24e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(market_value) 8.959 8.999 3438.57 <2e-16 ***
## s(age) 4.712 5.742 66.02 <2e-16 ***
## s(season) 8.041 8.776 33.91 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.791 Deviance explained = 79.2%
## GCV = 3.006e+13 Scale est. = 2.9964e+13 n = 9302
filtered_data$predicted_fee <- predict(gam_model, newdata = filtered_data)
# Plot the observed data and the fitted values
ggplot(filtered_data, aes(x = market_value, y = fee, color = league)) +
geom_point(alpha = 1) + # Scatter plot of observed data
geom_line(aes(y = predicted_fee), color = "purple", size = 1) + # Fitted values from the GAM model
labs(title = "Observed Data vs. Fitted Values (GAM Model)",
x = "Market Value",
y = "Transfer Fee")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.